home *** CD-ROM | disk | FTP | other *** search
Text File | 2010-02-10 | 27.4 KB | 1,058 lines |
- // DO NOT import this into the global namespace, but instead
- // import it into your own namespace wrapper
-
- var EXPORTED_SYMBOLS = ["DefaultTextExtractor"];
-
- Components.utils.import("resource://glydo/utils/prototype_xul_1_6_0_3_modified.jsm");
- Components.utils.import("resource://glydo/utils/Utils.jsm");
- Components.utils.import("resource://glydo/utils/Prefs.jsm");
-
// Namespace object listed in EXPORTED_SYMBOLS; every class and table of this
// module is attached to it. Declared with `var` so the binding is explicit
// instead of being created as a side effect of assigning to an undeclared name.
var DefaultTextExtractor = {};
-
// Lookup table from lower-case HTML tag name to the handling category that
// ExtractionTask.parseDocumentNode dispatches on. Tags that are not listed get
// no category and simply have their children parsed.
DefaultTextExtractor.TAG_TYPES = (function() {
    // The same mapping, written category-first to keep related tags together.
    var tagsByType = {
        "anchor": ["a"],
        "header": ["h1", "h2", "h3", "h4", "h5", "h6"],
        "div": ["body", "div", "table"],
        "par": ["blockquote", "caption", "dir", "dl", "li", "menu", "ol",
                "p", "pre", "td", "th"],
        "par-start": ["dt"],
        "line-break": ["br", "dd", "hr"],
        "skip": ["applet", "base", "code", "del", "embed", "frame", "head",
                 "iframe", "img", "link", "map", "noframes", "noscript",
                 "object", "samp", "script", "select", "style", "var"]
    };
    var table = {};
    for (var type in tagsByType) {
        if (!Object.prototype.hasOwnProperty.call(tagsByType, type)) {
            continue;
        }
        var tags = tagsByType[type];
        for (var i = 0; i < tags.length; ++i) {
            table[tags[i]] = type;
        }
    }
    return table;
})();
-
// Style properties recorded for every Text object. Each entry is a pair:
//   [0] key under which the value is stored in the Text's `style` map,
//   [1] CSS property name passed to Prototype.E.getStyle.
DefaultTextExtractor.STYLE_PROPERTIES = [
    ["fontWeight", "font-weight"],
    ["fontSize", "font-size"]
];
-
-
-
/**
 * Base class for nodes of the extraction tree that hold an ordered list of
 * child objects. Word counts and the score are computed lazily and cached;
 * once a total count or the score has been computed the container is
 * considered locked and add() refuses further children.
 */
DefaultTextExtractor.Container = Prototype.Class.create({
    /**
     * @param depth     value stored on the instance as `depth`
     * @param separator string inserted between children by toText();
     *                  defaults to the empty string when omitted
     */
    initialize: function(depth, separator) {
        this.objects = [];
        this.headerCandidates = [];
        this.depth = depth;
        this.score = null;
        // null means "not computed yet" for all four cached counters.
        this.nTextWordsCount = null;
        this.nLinkWordsCount = null;
        this.nLocalTextWordsCount = null;
        this.nLocalLinkWordsCount = null;
        this.separator = (separator === undefined) ? "" : separator;
    },

    /** @return true once a total word count or the score has been computed. */
    isLocked: function() {
        return !(this.nTextWordsCount === null &&
                 this.nLinkWordsCount === null &&
                 this.score === null);
    },

    /** @return sum of getTextWordsCount() over all children (cached). */
    getTextWordsCount: function() {
        if (this.nTextWordsCount !== null) {
            return this.nTextWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            total += this.objects[i].getTextWordsCount();
        }
        this.nTextWordsCount = total;
        return this.nTextWordsCount;
    },

    /** @return text word count of the direct Paragraph children only (cached). */
    getLocalTextWordsCount: function() {
        if (this.nLocalTextWordsCount !== null) {
            return this.nLocalTextWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child instanceof DefaultTextExtractor.Paragraph) {
                total += child.getTextWordsCount();
            }
        }
        this.nLocalTextWordsCount = total;
        return this.nLocalTextWordsCount;
    },

    /** @return sum of getLinkWordsCount() over all children (cached). */
    getLinkWordsCount: function() {
        if (this.nLinkWordsCount !== null) {
            return this.nLinkWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            total += this.objects[i].getLinkWordsCount();
        }
        this.nLinkWordsCount = total;
        return this.nLinkWordsCount;
    },

    /** @return link word count of the direct Paragraph children only (cached). */
    getLocalLinkWordsCount: function() {
        if (this.nLocalLinkWordsCount !== null) {
            return this.nLocalLinkWordsCount;
        }
        var total = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child instanceof DefaultTextExtractor.Paragraph) {
                total += child.getLinkWordsCount();
            }
        }
        this.nLocalLinkWordsCount = total;
        return this.nLocalLinkWordsCount;
    },

    /** @return true when the container holds no words at all. */
    isEmpty: function() {
        return this.getTextWordsCount() == 0;
    },

    /**
     * Appends a child object.
     * @throws a string when the container is already locked
     */
    add: function(item) {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.objects.push(item);
    },

    /** Remembers `item` as a possible page header (no locking check). */
    addHeaderCandidate: function(item) {
        this.headerCandidates.push(item);
    },

    /** Base rule: only empty containers are pruned. Subclasses refine this. */
    shouldPrune: function() {
        return this.isEmpty();
    },

    /**
     * Serialises the container into an element created from `doc`.
     * @param doc    document used to create nodes
     * @param params optional; `params.dontPrune` disables the shouldPrune()
     *               check (empty containers are still dropped); also handed
     *               on to the children
     * @param name   element name, "container" when omitted
     * @return the new element, or null when pruned or empty
     */
    toXml: function(doc, params, name) {
        var pruningEnabled = !params || !params.dontPrune;
        if (pruningEnabled && this.shouldPrune()) {
            return null;
        }
        if (this.isEmpty()) {
            return null;
        }
        var elem = doc.createElement(name === undefined ? "container" : name);
        // Loose comparison on purpose: skips both null and undefined scores.
        if (this.score != undefined) {
            elem.setAttribute("score", this.score);
        }
        this.objects.forEach(function(child) {
            var childXml = child.toXml(doc, params);
            if (childXml !== null) {
                elem.appendChild(childXml);
            }
        });
        return elem;
    },

    /** @return the children's toText() results joined with the separator. */
    toText: function() {
        var parts = this.objects.map(function(child) {
            return child.toText();
        });
        return parts.join(this.separator);
    },

    /** Delegates to Utils.containsAnyOf on the container's text. */
    containsAnyOf: function(searchStrings) {
        return Utils.containsAnyOf(this.toText(), searchStrings);
    },

    /**
     * Scores every child that has a calcScore method and sums the scores of
     * those children that should not be pruned into this.score.
     */
    calcScore: function() {
        this.score = 0;
        for (var i = 0; i < this.objects.length; ++i) {
            var child = this.objects[i];
            if (child.calcScore === undefined) {
                continue;
            }
            child.calcScore();
            if (!child.shouldPrune()) {
                this.score += child.score;
            }
        }
    },

    /**
     * Depth-first search. A child matching `predicate` is collected and not
     * descended into; a non-matching child is searched if it has collect().
     * @return resultList (a new array when none was passed in)
     */
    collect: function(predicate, resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.objects.forEach(function(child) {
            if (predicate(child)) {
                found.push(child);
            } else if (child.collect !== undefined) {
                child.collect(predicate, found);
            }
        });
        return found;
    },

    /**
     * Gathers this container's header candidates followed by those of all
     * nested Container children, recursively.
     * @return resultList (a new array when none was passed in)
     */
    collectHeaderCandidates: function(resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.headerCandidates.forEach(function(candidate) {
            found.push(candidate);
        });
        this.objects.forEach(function(child) {
            if (child instanceof DefaultTextExtractor.Container) {
                child.collectHeaderCandidates(found);
            }
        });
        return found;
    },

    /**
     * Like collect() but looks at direct children only.
     * @return resultList (a new array when none was passed in)
     */
    collectLocal: function(predicate, resultList) {
        var found = (resultList === undefined) ? [] : resultList;
        this.objects.forEach(function(child) {
            if (predicate(child)) {
                found.push(child);
            }
        });
        return found;
    }
});
-
/**
 * A run of text with uniform link state and style inside a paragraph.
 * Keeps the raw text, a whitespace-collapsed copy (`strippedText`) and a
 * word count, and reports itself to its container as a header candidate
 * when it is rendered large and bold.
 */
DefaultTextExtractor.Text = Prototype.Class.create({
    /**
     * @param container        object receiving addHeaderCandidate() calls
     * @param parentElement    element whose style is recorded for this text
     * @param text             initial text
     * @param isLink           whether the text is inside a link
     * @param remLeadingSpaces whether one leading blank is removed from the
     *                         collapsed text
     */
    initialize: function(container,parentElement,text,isLink,remLeadingSpaces) {
        this.container = container;
        this.text = "";
        this.isLink = isLink;
        this.remLeadingSpaces = remLeadingSpaces;
        // Guards against registering the same object more than once.
        this.registeredAsHeaderCandidate = false;
        this.storeStyle(parentElement);
        this.addText(text);
    },

    /**
     * Updates the leading-space flag. When switched on, the already computed
     * collapsed text is stripped right away; previously the flag only took
     * effect the next time text was appended, so a text created with the
     * flag off kept its leading blank.
     */
    setRemLeadingSpaces: function(remLeadingSpaces) {
        this.remLeadingSpaces = remLeadingSpaces;
        if (remLeadingSpaces) {
            this.lstrip();
        }
    },

    /**
     * Reads every property listed in STYLE_PROPERTIES from `element` and
     * stores the non-null values in this.style.
     */
    storeStyle: function(element) {
        var style = {};
        DefaultTextExtractor.STYLE_PROPERTIES.forEach(function(p) {
            var s = Prototype.E.getStyle(element,p[1]);
            if (s !== null) {
                style[p[0]] = s;
            }
        },this);
        this.style = style;
    },

    /** @return true when link state and recorded style both match. */
    isCompatibleWith: function(otherText) {
        if (otherText.isLink !== this.isLink) {
            return false;
        }
        return this.stylesCompatible(this.style,otherText.style);
    },

    /**
     * Appends `text`, then recomputes the word count and the collapsed text
     * from the full raw text.
     */
    addText: function(text) {
        this.text += text;
        var words = this.text.split(/\s+/);
        var n = 0;
        for (var i = 0; i < words.length; ++i) {
            // split() yields empty strings for leading/trailing whitespace.
            if (words[i].length != 0) {
                n++;
            }
        }
        this.nWords = n;
        this.strippedText = this.text.replace(/\s+/g," ");
        if (this.remLeadingSpaces) {
            this.lstrip();
        }
        var fontSize = Utils.getPixelsFromStyleSizeStr(this.style["fontSize"]);
        if (fontSize >= 16 && this.style["fontWeight"] == "bold") {
            // Register once only; appending more text must not add
            // duplicate entries for the same object.
            if (!this.registeredAsHeaderCandidate) {
                this.registeredAsHeaderCandidate = true;
                this.container.addHeaderCandidate(this);
            }
        }
    },

    /** Removes a single trailing space from the collapsed text, if present. */
    rstrip: function() {
        var l = this.strippedText.length;
        if ((l > 0) && (this.strippedText.charAt(l-1) === ' ')) {
            this.strippedText = this.strippedText.substring(0,l-1);
        }
    },

    /** Removes a single leading space from the collapsed text, if present. */
    lstrip: function() {
        var l = this.strippedText.length;
        if ((l > 0) && (this.strippedText.charAt(0) === ' ')) {
            this.strippedText = this.strippedText.substring(1);
        }
    },

    /** @return the whitespace-collapsed text. */
    toText: function() {
        return this.strippedText;
    },

    // TODO: What about locking of text objects?
    /** @return number of words in the text. */
    getTextWordsCount: function() {
        return this.nWords;
    },

    /** @return the word count for link text, 0 otherwise. */
    getLinkWordsCount: function() {
        return this.isLink ? this.nWords : 0;
    },

    /** Text objects are never pruned on their own. */
    shouldPrune: function() {
        return false;
    },

    /**
     * Serialises the text. Link text is wrapped in an "a" element; with
     * params.styleInfo set, non-link text is wrapped in a "span" and the
     * wrapper gets a "style" attribute holding the id of a matching entry
     * in params.styles (one is appended when none matches).
     * @return the wrapper element, or a bare text node when unwrapped
     */
    toXml: function(doc,params) {
        var text = doc.createTextNode(this.toText());
        var elem = null;
        if (this.isLink) {
            elem = doc.createElement("a");
        } else if (params && params.styleInfo) {
            elem = doc.createElement("span");
        }
        if (elem) {
            elem.appendChild(text);
            if (params && params.styleInfo) {
                params.styles = params.styles || [];
                var foundStyle = Prototype.A.find(params.styles,function(prevStyle) {
                    return this.stylesCompatible(prevStyle.values,this.style);
                },this);
                if (!foundStyle) {
                    var n = params.styles.length;
                    foundStyle = {
                        id: n,
                        values: this.style
                    };
                    params.styles.push(foundStyle);
                }
                elem.setAttribute("style",foundStyle.id);
            }
            return elem;
        }
        return text;
    },

    /**
     * @return true when both style maps have strictly equal values for
     *         every key present in either of them
     */
    stylesCompatible: function(styleA,styleB) {
        var p;
        for (p in styleA) {
            if (styleA[p] !== styleB[p]) {
                return false;
            }
        }
        for (p in styleB) {
            if (styleA[p] !== styleB[p]) {
                return false;
            }
        }
        return true;
    }
});
-
/**
 * A paragraph: a Container whose children are Text and LineBreak objects.
 * Tracks whether text currently being added is inside a link and merges
 * consecutive compatible text runs.
 */
DefaultTextExtractor.Paragraph = Prototype.Class.create(DefaultTextExtractor.Container,{

    /**
     * @param containingBlock the block this paragraph belongs to
     * @param depth           passed on to the Container constructor
     */
    initialize: function($super,containingBlock,depth) {
        $super(depth);
        this.containingBlock = containingBlock;
        this.nLinks = 0;
        this.inLink = false;
    },

    /**
     * Adds text to the paragraph. The text is merged into the last child
     * when that is a compatible Text; otherwise a new Text is appended and
     * doubled blanks at the seam between the two runs are avoided.
     * @param parentElement element the text's style is taken from
     * @param text          text to add; undefined is ignored
     * @throws a string when the paragraph is already locked
     */
    addText: function(parentElement,text) {
        if (this.isLocked()) {
            throw "Cannot modify paragraph once scores have been calculated";
        }
        if (text !== undefined) {
            var newText = new DefaultTextExtractor.Text(this, parentElement, text, this.inLink, false);
            var nObjects = this.objects.length;
            var prevSpace = true;
            var remLeadingSpaces = true;
            if (nObjects > 0) {
                var lastText = this.objects[nObjects-1];
                if ((lastText instanceof DefaultTextExtractor.Text) &&
                    lastText.isCompatibleWith(newText)) {
                    lastText.addText(text);
                    return;
                }
                var t = lastText.toText();
                if (t.length == 0 || !Prototype.S.blank(t.substring(t.length-1))) {
                    prevSpace = false;
                    remLeadingSpaces = false;
                }
                // If the previous text ended with a blank and the
                // current text begins with a blank, remove one of them
                if (prevSpace && Prototype.S.blank(text.charAt(0))) {
                    // If the last text is a link, remove the space from that
                    if (lastText.isLink) {
                        lastText.rstrip();
                        remLeadingSpaces = false;
                    }
                }
            }
            newText.setRemLeadingSpaces(remLeadingSpaces);
            this.add(newText);
        }
    },

    /**
     * Calls rstrip() on the last child, if there is one.
     * @throws a string when the paragraph is already locked
     */
    trimTrailingSpaces: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        // An assignment to an undeclared, never-read `prevSpace` used to sit
        // here; it only created a stray global and has been dropped.
        var nObjects = this.objects.length;
        if (nObjects > 0) {
            var last = this.objects[nObjects-1];
            last.rstrip();
        }
    },

    /**
     * Finishes the paragraph by trimming its trailing space.
     * @throws a string when the paragraph is already locked
     */
    close: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.trimTrailingSpaces();
    },

    /**
     * Marks subsequently added text as link text and counts the link.
     * @throws a string when the paragraph is already locked
     */
    openLink: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.nLinks++;
        this.inLink = true;
    },

    /**
     * Marks subsequently added text as plain text again.
     * @throws a string when the paragraph is already locked
     */
    closeLink: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.inLink = false;
    },

    /**
     * Trims the trailing space and appends a LineBreak child.
     * @throws a string when the paragraph is already locked
     */
    addLineBreak: function() {
        if (this.isLocked()) {
            throw "Cannot modify container once scores have been calculated";
        }
        this.trimTrailingSpaces();
        this.add(new DefaultTextExtractor.LineBreak());
    },

    /** Same as Container.toXml, with "p" as the default element name. */
    toXml: function($super,doc,params,name) {
        if (name === undefined) {
            name = "p";
        }
        return $super(doc,params,name);
    },

    /** Pruned when empty or when the score is -5 or lower. */
    shouldPrune: function($super) {
        return $super() || this.score <= -5;
    },

    /**
     * Scores the paragraph: plain (non-link) words raise the score, a high
     * share of link words lowers it, with an extra penalty for paragraphs
     * of more than 4 words that contain more than 2 link words.
     */
    calcScore: function() {
        this.score = 0;
        var ntw = this.getTextWordsCount();
        var nlw = this.getLinkWordsCount();
        if (ntw > 0) {
            this.score -= 10*nlw/ntw;
        }
        // Paragraph contains too many separate links (from daled)
        if (ntw > 4 && nlw > 2) {
            this.score -= 10*nlw/ntw;
        }
        this.score += (ntw - nlw)/4.0;
    }
});
-
/**
 * Paragraph child that stands for a forced line break. It contributes no
 * words, renders as a newline in text and as a "br" element in XML, and
 * accepts the strip calls made on paragraph children without doing anything.
 */
DefaultTextExtractor.LineBreak = Prototype.Class.create({
    /** A line break contains no words. */
    getTextWordsCount: function() { return 0; },

    /** A line break contains no link words. */
    getLinkWordsCount: function() { return 0; },

    /** @return a single newline character. */
    toText: function() { return "\n"; },

    /** @return a new "br" element created from `doc`; `params` is unused. */
    toXml: function(doc, params) {
        var br = doc.createElement("br");
        return br;
    },

    /** Nothing to strip on the right. */
    rstrip: function() {},

    /** Nothing to strip on the left. */
    lstrip: function() {}
});
-
/**
 * A heading paragraph. Behaves like a Paragraph but remembers its heading
 * level, scores level-1 headings with its own formula and serialises to an
 * "h" element carrying a "level" attribute.
 */
DefaultTextExtractor.Header = Prototype.Class.create(DefaultTextExtractor.Paragraph,{
    /**
     * @param containingBlock passed on to Paragraph
     * @param depth           passed on to Paragraph
     * @param level           heading level, stored as given
     */
    initialize: function($super, containingBlock, depth, level) {
        $super(containingBlock, depth);
        this.level = level;
    },

    /**
     * Level-1 headings get 5 plus half the sum of their text and link word
     * counts; every other level falls back to the Paragraph score.
     */
    calcScore: function($super) {
        // Loose comparison kept: the level may be a string such as "1".
        if (this.level != 1) {
            $super();
            return;
        }
        var textWords = this.getTextWordsCount();
        var linkWords = this.getLinkWordsCount();
        this.score = 5 + (textWords + linkWords) / 2.0;
    },

    /**
     * Same as Paragraph.toXml with "h" as the default element name; a
     * non-null result additionally receives the "level" attribute.
     */
    toXml: function($super, doc, params, name) {
        var elementName = (name === undefined) ? "h" : name;
        var elem = $super(doc, params, elementName);
        if (elem === null) {
            return elem;
        }
        elem.setAttribute("level", this.level);
        return elem;
    }
});
-
-
/**
 * A block-level region of the page (created for "div"-type tags). Holds
 * paragraphs, headers and nested blocks, records the element's dimensions
 * and position, and computes a layout-based punishment factor that scales
 * down its local score.
 */
DefaultTextExtractor.Block = Prototype.Class.create(DefaultTextExtractor.Container,{
    /**
     * @param depth          nesting depth; depth 1 blocks use their own
     *                       dimensions as the root dimensions
     * @param element        optional element the block was created for
     * @param position       optional position; when falsy it is taken from
     *                       Prototype.E.cumulativeOffset(element)
     * @param scrollPosition unused here
     * @param rootDimensions optional dimensions that this block's width is
     *                       compared against
     */
    initialize: function($super,depth,element,position,scrollPosition,rootDimensions) {
        $super(depth,'\n\n');
        this.maxScore = null;
        this.rootDimensions = rootDimensions;
        this.dimensions = null;
        this.position = null;
        this.visible = null;
        // Child blocks keyed by the element id handed to openBlock().
        this.nodes = {};
        this.notInteresting = false;
        if (element) {
            this.dimensions = Prototype.E.getDimensions(element);
            this.position = position || Prototype.E.cumulativeOffset(element);
            // this is "1" because the first block holds the real width and height
            if (depth == 1) {
                this.rootDimensions = this.dimensions;
            }
        }
        this.punish = this.calcPunishment();
        if (this.punish > 3) {
            this.notInteresting = true;
        }
        this.openPar();
    },

    /**
     * Finds the block with the highest local score in this subtree and
     * stores that score in this.maxScore. A nested block wins only with a
     * maxScore above 0; this block wins when no nested block qualified or
     * when its own local score is strictly higher.
     * @return the best block (possibly this one)
     */
    getBestBlock: function() {
        this.maxScore = 0;
        var bestBlock = null;
        var descBestBlock = null;
        for (var iObject = 0; iObject < this.objects.length; ++iObject) {
            var object = this.objects[iObject];
            if (!(object instanceof DefaultTextExtractor.Block)) {
                continue;
            }
            descBestBlock = object.getBestBlock();
            if (descBestBlock !== null && this.maxScore < descBestBlock.maxScore) {
                bestBlock = descBestBlock;
                this.maxScore = descBestBlock.maxScore;
            }
        }
        var score = this.calcLocalScore();
        if (bestBlock === null || score > this.maxScore) {
            bestBlock = this;
            this.maxScore = score;
        }
        return bestBlock;
    },

    /**
     * Scores the block from its direct paragraphs only:
     * (share of non-link words) * sqrt(non-link words) / punish.
     */
    calcLocalScore: function() {
        var nAllWords = this.getLocalTextWordsCount();
        var nLinkWords = this.getLocalLinkWordsCount();
        var nOnlyTextWords = nAllWords - nLinkWords;
        var score = 0;
        if (nLinkWords + nOnlyTextWords != 0) {
            score = nOnlyTextWords / (nLinkWords + nOnlyTextWords) * Math.sqrt(nOnlyTextWords);
        }
        score = score/this.punish;

        return score;
    },

    /**
     * Computes the factor (1 or more) the local score is divided by.
     * Blocks starting below 800 from the top and blocks narrower than 30%
     * of the root width are punished. A block without a recorded position
     * or dimensions (no element was given) skips the respective rule
     * instead of failing on a null dereference.
     */
    calcPunishment: function() {
        var punish = 1;
        var factor;
        // punish divs that don't start at initial top view
        if (this.position && this.position.top > 800) {
            factor = (this.position.top - 800) / 100 + 1;
            if (factor > 5) {
                factor = 5;
            }
            punish = punish * factor;
        }
        // Loose comparison: neither null nor undefined root dimensions count.
        if (this.dimensions && this.rootDimensions != null) {
            // punish divs with width < 30%
            if (this.dimensions.width < this.rootDimensions.width*0.3) {
                // grows with the missing width; capped at 3 below
                factor = (this.rootDimensions.width*0.3 - this.dimensions.width) / this.rootDimensions.width * 5 + 1;
                if (factor > 3) {
                    factor = 3;
                }
                punish = punish * factor;
            }
        }
        return punish;
    },

    /**
     * Creates a nested block one level deeper, adds it as a child and
     * registers it under `elementId`.
     * @return the new block
     */
    openBlock: function(element,elementId,position,scrollPosition) {
        var b = new DefaultTextExtractor.Block(this.depth+1,element,position,scrollPosition,this.rootDimensions);
        this.add(b);
        this.nodes[elementId] = b;
        return b;
    },

    /** @return the nested block registered under `elementId`, if any. */
    getBlockByElement: function(elementId) {
        return this.nodes[elementId];
    },

    /** Forwards to the current paragraph. */
    addLineBreak: function() {
        this.curPar.addLineBreak();
    },

    /** Forwards to the current paragraph. */
    addText: function(parentElement,text) {
        this.curPar.addText(parentElement,text);
    },

    /** Forwards to the current paragraph. */
    openLink: function() {
        this.curPar.openLink();
    },

    /** Forwards to the current paragraph. */
    closeLink: function() {
        this.curPar.closeLink();
    },

    /**
     * @return the direct children that are paragraphs but not headers and
     *         whose text contains none of the disqualifying words
     */
    getNormalParagraphs: function(paragraphDisqualifyingWords) {
        return this.collectLocal(function(o) {
            if (
                (o instanceof DefaultTextExtractor.Paragraph) &&
                !(o instanceof DefaultTextExtractor.Header)) {
                return !o.containsAnyOf(paragraphDisqualifyingWords);
            }
        });
    },

    /**
     * Starts a header as the current paragraph; levels 1 and 2 are also
     * recorded as header candidates.
     */
    openHeader: function(level) {
        this.curPar = new DefaultTextExtractor.Header(this, this.depth+1, level);
        this.add(this.curPar);
        if (level <= 2) {
            this.addHeaderCandidate(this.curPar);
        }
    },

    /** Starts a new current paragraph without closing the previous one. */
    openPar: function() {
        this.curPar = new DefaultTextExtractor.Paragraph(this, this.depth+1);
        this.add(this.curPar);
    },

    /** Closes the current paragraph and starts a fresh one. */
    closePar: function() {
        this.curPar.close();
        this.curPar = new DefaultTextExtractor.Paragraph(this, this.depth+1);
        this.add(this.curPar);
    },

    /** Pruned when empty or when the score is below 3. */
    shouldPrune: function($super) {
        return $super() || this.score < 3;
    },

    /**
     * Same as Container.toXml with "block" as the default element name,
     * plus layout attributes on the resulting element.
     */
    toXml: function ($super,doc,params,name) {
        if (name === undefined) {
            name = "block";
        }
        var elem = $super(doc,params,name);
        if (elem) {
            // NOTE(review): `display` is never assigned in this class (the
            // constructor initialises `visible` instead), so d="0" is
            // written unless something else sets it - confirm intent.
            if (!this.display) {
                elem.setAttribute("d","0");
            }
            if (this.dimensions) {
                elem.setAttribute("w",this.dimensions.width);
                elem.setAttribute("h",this.dimensions.height);
            }
            if (this.position) {
                elem.setAttribute("l",this.position.left);
                elem.setAttribute("t",this.position.top);
            }
            if (this.viewportPosition) {
                elem.setAttribute("vl",this.viewportPosition.left);
                elem.setAttribute("vt",this.viewportPosition.top);
            }
            if (this.viewportDimensions) {
                elem.setAttribute("viewWidth",this.viewportDimensions.width);
                elem.setAttribute("viewHeight",this.viewportDimensions.height);
            }
        }
        return elem;
    }
});
-
/**
 * Extracts the main text and a headline from a document. The DOM is walked
 * in time-limited phases: every visited node is tagged with a status via
 * user data so that a phase that ran out of time can be resumed by a later
 * execute() call without processing nodes twice.
 */
DefaultTextExtractor.ExtractionTask = Prototype.Class.create({

    /**
     * @param doc       document to extract from
     * @param url       stored on the task
     * @param options   optional overrides for the parameters set up in
     *                  initParams()
     * @param callbacks optional map of notification functions; execute()
     *                  uses "notifyTaskFailed", "notifyContextItemExtracted"
     *                  and "notifyTaskDone"
     */
    initialize: function(doc,url,options,callbacks) {
        this.doc = doc;
        this.url = url;
        this.options = options || {};
        this.callNumber = 0;
        this.filtered = {};
        this.callbacks = callbacks || {};
        this.initParams();
    },

    /**
     * Parses one element and its subtree into `block`, dispatching on the
     * element's entry in TAG_TYPES.
     * @return true when the node is finished (or was skipped), false when
     *         the phase time budget ran out and parsing must be resumed
     */
    parseDocumentNode: function(block, node, offsetParentPosition, parentScrollPosition) {
        // A missing node carries no user data and has nothing to parse;
        // test for it before any status lookup dereferences it.
        if (node === null || node === undefined) {
            return true;
        }
        if (this.isNodeStatusDone(node)) {
            return true;
        }
        if (block.notInteresting || node.nodeType != Components.interfaces.nsIDOMNode.ELEMENT_NODE) {
            this.markNodeStatusAsDone(node);
            return true;
        }

        var now = new Date();
        if (now.getTime() - this.phaseStart.getTime() > this.taskPhaseMax) {
            return false;
        }

        // `started` is true when an earlier, interrupted phase already
        // performed this node's opening action.
        var started = false;
        if (this.isNodeStatusStarted(node)) {
            started = true;
        } else {
            this.markNodeStatusAsStarted(node);
        }

        var position = null;
        var scrollPosition = null;

        var tag_type = DefaultTextExtractor.TAG_TYPES[node.nodeName.toLowerCase()];

        if (tag_type == "skip") {
            this.markNodeStatusAsDone(node);
            return true;
        } else if (tag_type == "anchor") {
            if (!started) {
                block.openLink();
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
            block.closeLink();
        } else if (tag_type == "ignore") {
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
        } else if (tag_type == "par-start") {
            if (!started) {
                block.openPar();
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
        } else if (tag_type == "par") {
            if (!started) {
                block.openPar();
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
            block.closePar();
        } else if (tag_type == "line-break") {
            if (!started) {
                block.addLineBreak();
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
        } else if (tag_type == "header") {
            // Second character of "H1".."H6".
            var level = node.nodeName[1];
            if (!started) {
                block.openHeader(level);
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
            block.closePar();
        } else if (tag_type == "div") {
            var nodeId;
            if (!started) {
                nodeId = this.setNodeId(node);
                block = block.openBlock(node,nodeId,position,scrollPosition);
            } else {
                nodeId = this.getNodeId(node);
                if (!nodeId) {
                    nodeId = this.setNodeId(node);
                }
                // Keep the parent in its own variable: when the lookup finds
                // nothing, the replacement block must be opened on the
                // parent, not on the (undefined) lookup result.
                var parentBlock = block;
                block = parentBlock.getBlockByElement(nodeId);
                if (!block) {
                    block = parentBlock.openBlock(node,nodeId,position,scrollPosition);
                }
            }
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
        } else {
            if (!this.parseDocumentNodeChildren(block, node, offsetParentPosition, scrollPosition)) {
                return false;
            }
        }
        this.markNodeStatusAsDone(node);
        return true;
    },

    /**
     * Parses the children of `node`: elements recursively, text and CDATA
     * nodes by adding their value to `block` once.
     * @return false as soon as a child element runs out of time
     */
    parseDocumentNodeChildren: function(block, node, basePositionForChildren, scrollPosition) {
        var children = node.childNodes;
        for (var ci = 0; ci < children.length; ++ci) {
            var child = children[ci];
            switch (child.nodeType) {
            case Components.interfaces.nsIDOMNode.ELEMENT_NODE:
                if (!this.parseDocumentNode(block,child,basePositionForChildren,scrollPosition)) {
                    return false;
                }
                break;
            case Components.interfaces.nsIDOMNode.TEXT_NODE:
            case Components.interfaces.nsIDOMNode.CDATA_SECTION_NODE:
                if (!this.isNodeStatusDone(child)) {
                    block.addText(node,child.nodeValue);
                    this.markNodeStatusAsDone(child);
                }
                break;
            }
        }
        return true;
    },

    /** Tags `node` as completely parsed. */
    markNodeStatusAsDone: function(node) {
        node.setUserData("glydoParsedNodeStatus","DONE",null);
    },

    /** Tags `node` as entered but not yet finished. */
    markNodeStatusAsStarted: function(node) {
        node.setUserData("glydoParsedNodeStatus","STARTED",null);
    },

    /** @return true when `node` is tagged as completely parsed. */
    isNodeStatusDone: function(node) {
        return node.getUserData("glydoParsedNodeStatus") == "DONE";
    },

    /** @return true when `node` is tagged as entered but not finished. */
    isNodeStatusStarted: function(node) {
        return node.getUserData("glydoParsedNodeStatus") == "STARTED";
    },

    /**
     * Stores a fresh id from Utils.uuid1() on `node`.
     * @return the new id
     */
    setNodeId: function(node) {
        var id = Utils.uuid1();
        node.setUserData("glydoUniqueId",id,null);
        return id;
    },

    /** @return the id previously stored on `node` by setNodeId. */
    getNodeId: function(node) {
        return node.getUserData("glydoUniqueId");
    },

    /**
     * Rates how likely `candidate` (a Header or a Text) is the page
     * headline. Heading level or font size gives the base value; matching
     * part of the page title and lying in or near `bestBlock` raise it,
     * while a position at the very top, far below or far to the side of
     * `bestBlock` lowers it.
     * @return the score; 0 for blank candidates and for candidates in a
     *         block flagged notInteresting
     */
    calcHeaderProbability: function(bestBlock, candidate, pageTitle) {
        if (Prototype.S.strip(candidate.toText()).length == 0) {
            return 0;
        }
        var score = 1;
        var containingBlock = null;
        // give higher score to h1 or h2
        if (candidate instanceof DefaultTextExtractor.Header) {
            score *= 4-candidate.level;
            containingBlock = candidate.containingBlock;
        } else {
            // give higher score to larger fonts
            var fontSize = Utils.getPixelsFromStyleSizeStr(candidate.style["fontSize"]);
            score *= fontSize / 16.0;
            containingBlock = candidate.container.containingBlock;
        }
        if (containingBlock.notInteresting) {
            return 0;
        }
        var text = Prototype.S.strip(candidate.toText());
        // if the candidate is a part of the page title it gives it a serious boost...
        if (pageTitle !== null && (pageTitle.indexOf(text) != -1) && text.length > 10) {
            score *= 3;
        }
        if (containingBlock == bestBlock) {
            score *= 2;
        } else {
            // not too close to the top
            if (containingBlock.position.top < 50) {
                score /= 2;
            }
            // not too far down from the article (even though a little down is ok, cause there can be another
            // block in between)
            if (containingBlock.position.top - bestBlock.position.top > 100) {
                score /= 3;
            }
            // not too to the side from the article
            var horizDiff = containingBlock.position.left - bestBlock.position.left;
            if (horizDiff > 50 || horizDiff < -200) {
                score /= 3;
            }
        }
        return score;
    },

    /**
     * Picks the header candidate below `block` with the highest
     * calcHeaderProbability() value.
     * @return {header, headerCertainty}; header is null and the certainty
     *         0 when no candidate scored above 0
     */
    extractHeader: function(block, bestBlock, pageTitle) {
        var headerList = block.collectHeaderCandidates();
        var maxHeaderScore = 0;
        var header = null;
        if (headerList !== null) {
            for (var hi = 0; hi < headerList.length; ++hi) {
                if (!(headerList[hi] instanceof DefaultTextExtractor.Header) &&
                    !(headerList[hi] instanceof DefaultTextExtractor.Text)) {
                    continue;
                }
                var headerScore = this.calcHeaderProbability(bestBlock, headerList[hi], pageTitle);
                if (headerScore > maxHeaderScore) {
                    maxHeaderScore = headerScore;
                    header = headerList[hi];
                }
            }
        }
        // An empty `if` comparing the headline with the page title used to
        // follow here; it had no effect and was removed.
        return ({header: header, headerCertainty: maxHeaderScore});
    },

    /**
     * @return the non-header paragraphs of `block` that contain none of
     *         the configured disqualifying words
     */
    extractNormalParagraphs: function(block) {
        // Get list of non-header texts
        var paraList = block.getNormalParagraphs(this.paragraphDisqualifyingWords);
        return paraList;
    },

    /**
     * Serialises the normal paragraphs of `block` and appends every
     * non-null result to `parent`.
     */
    extractNormalParagraphsAsXml: function(block,parent) {
        var paraList = this.extractNormalParagraphs(block);
        for (var ti = 0; ti < paraList.length; ++ti) {
            var x = paraList[ti].toXml(parent.ownerDocument);
            if (x !== null) {
                parent.appendChild(x);
            }
        }
    },

    /**
     * Builds the result item: the title from the best header candidate
     * under `rootBlock`, the body from the normal paragraphs of
     * `bestBlock`, collected in a document fragment.
     */
    getContextItem: function(doc,rootBlock, bestBlock, pageTitle) {
        var item = {
            "@type": "text",
            title: {
                "__content": ""
            },
            body: {
                "__content": ""
            }
        };
        var ht = this.extractHeader(rootBlock, bestBlock, pageTitle);
        item.title["@certainty"] = ht.headerCertainty;
        if (ht.header !== null) {
            item.title["__content"] = ht.header.toText();
        }

        var textDoc = doc.implementation.createDocument("","",null);
        var body = textDoc.createDocumentFragment();
        textDoc.appendChild(body);
        this.extractNormalParagraphsAsXml(bestBlock,body);
        item.body["@score"] = bestBlock.maxScore;
        item.body["__content"] = body;

        return item;
    },

    /**
     * Sets paragraphDisqualifyingWords, taskPhaseMax, totalTaskMax and
     * taskBreak on the task, taking each from this.options when given and
     * from the defaults below otherwise.
     */
    initParams: function() {
        var defaults = ({
            paragraphDisqualifyingWords:
            [
                "\u00A9",
                "registered trademark",
                "all rights reserved",
                ["inappropriate", "comments"],
                ["profanity", "comments"],
                ["terms", "conditions"]
            ],
            taskPhaseMax: Prefs.task_phase_max,
            totalTaskMax: Prefs.total_task_max,
            taskBreak: Prefs.task_break
        });
        this.setParameter("paragraphDisqualifyingWords",this.options,defaults);
        this.setParameter("taskPhaseMax",this.options,defaults);
        this.setParameter("totalTaskMax",this.options,defaults);
        this.setParameter("taskBreak",this.options,defaults);
    },

    /**
     * Copies params[name] onto the task, falling back to defaults[name]
     * when the parameter is undefined.
     */
    setParameter: function(name,params,defaults) {
        var p = params[name];
        if (p === undefined) {
            p = defaults[name];
        }
        this[name] = p;
    },

    /**
     * Runs one parsing phase. If the phase is interrupted, the task either
     * reschedules itself after `taskBreak` or, once `totalTaskMax` has
     * been exceeded, reports failure. When parsing completes, the best
     * block is selected and the result is handed to the callbacks.
     */
    execute: function() {
        this.callNumber++;
        if (!this.rootBlock) {
            this.taskStart = new Date();
            this.rootBlock = new DefaultTextExtractor.Block(0,this.doc.documentElement);
        }
        this.phaseStart = new Date();
        if (!this.parseDocumentNode(this.rootBlock, this.doc.documentElement)) {
            var after = new Date();
            if (after.getTime() - this.taskStart.getTime() > this.totalTaskMax) {
                if (this.callbacks["notifyTaskFailed"]) {
                    this.callbacks["notifyTaskFailed"]("Text extraction took too much");
                }
            } else {
                this.doc.defaultView.setTimeout(Prototype.F.bind(this.execute, this), this.taskBreak);
            }
        } else {
            var bestBlock = this.rootBlock.getBestBlock();

            var item = this.getContextItem(this.doc,this.rootBlock, bestBlock, this.doc.title);
            if (this.callbacks["notifyContextItemExtracted"]) {
                this.callbacks["notifyContextItemExtracted"](item);
            }
            if (this.callbacks["notifyTaskDone"]) {
                this.callbacks["notifyTaskDone"]();
            }
        }
    }

});
-